O1 R Masters :)

O1 R Masters :)

0. Source loading

source('notebooks/libraries.R')

1. Data set

1.1 Data loading

# Read the train and test sets, keeping text columns as character vectors
# (no automatic factor conversion). `FALSE` is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
raw_train_data <- fread("Data/BankCamp_train.csv", stringsAsFactors = FALSE)
raw_test_data <- fread("Data/BankCamp_test.csv", stringsAsFactors = FALSE)

# Inspect the column types of the training set.
str(raw_train_data)
## Classes 'data.table' and 'data.frame':   36168 obs. of  17 variables:
##  $ age      : int  50 47 56 36 41 32 26 60 39 55 ...
##  $ job      : chr  "entrepreneur" "technician" "housemaid" "blue-collar" ...
##  $ marital  : chr  "married" "married" "married" "married" ...
##  $ education: chr  "primary" "secondary" "primary" "primary" ...
##  $ default  : chr  "yes" "no" "no" "no" ...
##  $ balance  : int  537 -938 605 4608 362 0 782 193 2140 873 ...
##  $ housing  : chr  "yes" "yes" "no" "yes" ...
##  $ loan     : chr  "no" "no" "no" "no" ...
##  $ contact  : chr  "unknown" "unknown" "cellular" "cellular" ...
##  $ day      : int  20 28 19 14 12 4 29 12 16 3 ...
##  $ month    : chr  "jun" "may" "aug" "may" ...
##  $ duration : int  11 176 207 284 217 233 297 89 539 131 ...
##  $ campaign : int  15 2 6 7 3 3 1 2 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 276 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 2 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Inspect the column types of the test set; it has the same feature columns
# as the training set but no target column `y`.
str(raw_test_data)
## Classes 'data.table' and 'data.frame':   9043 obs. of  16 variables:
##  $ age      : int  58 43 51 56 32 54 58 54 32 38 ...
##  $ job      : chr  "management" "technician" "retired" "management" ...
##  $ marital  : chr  "married" "single" "married" "married" ...
##  $ education: chr  "tertiary" "secondary" "primary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2143 593 229 779 23 529 -364 1291 0 424 ...
##  $ housing  : chr  "yes" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "no" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 55 353 164 160 1492 355 266 179 104 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Classifying the variables as either discrete or continuous

# Names of the categorical (character) feature columns.
discrete_var <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "month", "poutcome"
)

# Names of the numeric (integer) feature columns.
continuous_var <- c(
  "age", "balance", "day", "duration",
  "campaign", "pdays", "previous"
)

2. Target distribution

# Bar chart of how often each value of the target `y` occurs in the training
# data; bars are filled per target value and the legend is hidden.
target_dist <- ggplot(data = raw_train_data, mapping = aes(x = y, fill = y)) +
  geom_bar() +
  scale_fill_manual(values = c("#995052", "#529950")) +
  labs(
    title = "Distribution of target variable",
    x = "target - campaign offer accepted"
  ) +
  theme_minimal() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

target_dist

3. Distribution of the continuous variables

# Summary table (quantiles, mean, standard deviation) of the continuous
# training features, rendered as an interactive table.
cont_var <- as.data.frame(raw_train_data)
# drop = FALSE keeps a data frame even when only one column name matches.
cont_var <- cont_var[, names(cont_var) %in% continuous_var, drop = FALSE]

# vapply() pins the result shape: five quantiles (0%, 25%, 50%, 75%, 100%)
# per column, and a single number per column for the mean and the sd.
distribution <- as.data.frame(t(vapply(cont_var, quantile, numeric(5))))
distribution$Mean <- vapply(cont_var, mean, numeric(1))
distribution$SD <- vapply(cont_var, sd, numeric(1))
datatable(round(distribution, 2))

3.1 Overview of the distribution of all the continuous variables

# Reshape the continuous features to long form; the plot below reads the
# resulting `variable` and `value` columns.
cont_var_melt <- as.data.frame(melt(cont_var))

# One density panel per continuous variable. Free scales give every panel
# its own axes; the legend is hidden because the panel strips already name
# the variable.
cont_dist <- ggplot(cont_var_melt, aes(value)) +
  geom_density(aes(fill = variable)) +
  facet_wrap(~variable, scales = "free", nrow = 3) +
  labs(x = "", y = "", fill = "") +
  theme_minimal() +
  scale_fill_tableau() +
  # Title typo fixed ("continous" -> "continuous").
  ggtitle("Distribution of each continuous variable") +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
cont_dist

3.2 Distribution of each continuous variable

## Label every training row with its origin
raw_train_data$dataType <- "train"

## Give the test set an empty target column so both tables share the same
## columns, then label its rows as well
raw_test_data$y <- NA
raw_test_data$dataType <- "test"

## Stack train and test into a single table
dataset <- rbind(raw_train_data, raw_test_data)

Comparing train and test distribution of continuous features

Age

# Overlaid density curves of `age`, one per `dataType` (train vs. test).
ggplot(data = dataset, mapping = aes(x = age, colour = dataType)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#e08926", "#3526e0")) +
  theme_classic() +
  labs(title = "Age Distribution")

Balance

# Overlaid density curves of `balance`, one per `dataType` (train vs. test).
ggplot(data = dataset, mapping = aes(x = balance, colour = dataType)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#e08926", "#3526e0")) +
  theme_classic() +
  labs(title = "Balance Distribution")

Day

# Overlaid density curves of `day`, one per `dataType` (train vs. test).
ggplot(data = dataset, mapping = aes(x = day, colour = dataType)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#e08926", "#3526e0")) +
  theme_classic() +
  labs(title = "Day Distribution")

Duration

# Overlaid density curves of `duration`, one per `dataType` (train vs. test).
ggplot(data = dataset, mapping = aes(x = duration, colour = dataType)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#e08926", "#3526e0")) +
  theme_classic() +
  labs(title = "Duration Distribution")

Campaign

# Overlaid density curves of `campaign`, one per `dataType` (train vs. test).
ggplot(data = dataset, mapping = aes(x = campaign, colour = dataType)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#e08926", "#3526e0")) +
  theme_classic() +
  labs(title = "Campaign Distribution")

pdays

# Overlaid density curves of `pdays`, one per `dataType` (train vs. test).
ggplot(data = dataset, mapping = aes(x = pdays, colour = dataType)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#e08926", "#3526e0")) +
  theme_classic() +
  labs(title = "pdays Distribution")

4. Distribution of the discrete variables

4.1 Overview of the distribution of the discrete variables

# Long-format table of the discrete training features. The plot below reads
# the melted table through its `Var2` column (presumably the feature name --
# confirm) and its `value` column (the category label).
df_disc <- as.data.frame(
  melt(sapply(raw_train_data[, ..discrete_var], as.factor))
)

# Bar chart of category counts, one panel per discrete feature, two rows of
# panels, each panel with its own axes.
disc_dist <- ggplot(data = df_disc, mapping = aes(x = value)) +
  geom_bar(mapping = aes(fill = Var2)) +
  facet_wrap(~Var2, scales = "free", nrow = 2) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
  scale_fill_tableau() +
  labs(
    title = "Count of each discrete variable",
    fill = "",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

disc_dist

4.2 Distribution of each discrete variable

Comparing train and test distribution of discrete features

Job

# Category counts of `job` in the training set.
job_train <- ggplot(data = raw_train_data, mapping = aes(x = job)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Job Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `job` in the test set.
job_test <- ggplot(data = raw_test_data, mapping = aes(x = job)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Job Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(job_train, job_test, ncol = 2)

Marital

# Category counts of `marital` in the training set.
marital_train <- ggplot(data = raw_train_data, mapping = aes(x = marital)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Marital Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `marital` in the test set.
marital_test <- ggplot(data = raw_test_data, mapping = aes(x = marital)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Marital Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(marital_train, marital_test, ncol = 2)

Education

# Category counts of `education` in the training set.
education_train <- ggplot(data = raw_train_data, mapping = aes(x = education)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Education Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `education` in the test set.
education_test <- ggplot(data = raw_test_data, mapping = aes(x = education)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Education Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(education_train, education_test, ncol = 2)

Default

# Category counts of `default` in the training set.
default_train <- ggplot(data = raw_train_data, mapping = aes(x = default)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Default Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `default` in the test set.
default_test <- ggplot(data = raw_test_data, mapping = aes(x = default)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Default Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(default_train, default_test, ncol = 2)

Housing

# Category counts of `housing` in the training set.
housing_train <- ggplot(data = raw_train_data, mapping = aes(x = housing)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Housing Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `housing` in the test set.
housing_test <- ggplot(data = raw_test_data, mapping = aes(x = housing)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Housing Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(housing_train, housing_test, ncol = 2)

Loan

# Category counts of `loan` in the training set.
loan_train <- ggplot(data = raw_train_data, mapping = aes(x = loan)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Loan Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `loan` in the test set.
loan_test <- ggplot(data = raw_test_data, mapping = aes(x = loan)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Loan Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(loan_train, loan_test, ncol = 2)

Contact

# Category counts of `contact` in the training set.
contact_train <- ggplot(data = raw_train_data, mapping = aes(x = contact)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Contact Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `contact` in the test set.
contact_test <- ggplot(data = raw_test_data, mapping = aes(x = contact)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Contact Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(contact_train, contact_test, ncol = 2)

Month

# Category counts of `month` in the training set.
month_train <- ggplot(data = raw_train_data, mapping = aes(x = month)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "Month Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `month` in the test set.
month_test <- ggplot(data = raw_test_data, mapping = aes(x = month)) +
  geom_bar(fill = "#e08926") +
  labs(title = "Month Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(month_train, month_test, ncol = 2)

Previous campaign

# Category counts of `poutcome` in the training set.
poutcome_train <- ggplot(data = raw_train_data, mapping = aes(x = poutcome)) +
  geom_bar(fill = "#3526e0") +
  labs(title = "poutcome Distribution - Train", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Category counts of `poutcome` in the test set.
poutcome_test <- ggplot(data = raw_test_data, mapping = aes(x = poutcome)) +
  geom_bar(fill = "#e08926") +
  labs(title = "poutcome Distribution - Test", x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "plain"),
    legend.position = "none",
    axis.text.x = element_text(size = 7, angle = 90),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Show both charts side by side for visual comparison.
grid.arrange(poutcome_train, poutcome_test, ncol = 2)

5. Correlation plot

# Pairwise Pearson correlations between the continuous training features,
# printed as numbers in the upper triangle of the correlation matrix.
correlation <- cor(cont_var, method = "pearson")
corrplot(correlation, type = "upper", method = "number")

6. Outlier Analysis

Outlier Analysis for continuous and discrete variables

Continuous Variables

# Horizontal box plots of the raw (unscaled) continuous features, all drawn
# against one shared value axis whose tick labels are hidden.
cont_box <- ggplot(data = cont_var_melt, mapping = aes(x = variable, y = value)) +
  geom_boxplot(mapping = aes(fill = variable)) +
  coord_flip() +
  scale_fill_tableau() +
  theme_minimal() +
  labs(x = "", y = "") +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_blank()
  )

cont_box

Continuous Scaled Variables

# Min-max scale every continuous training feature to [0, 1] so the box plots
# below share a comparable axis.
df_cont_norm <- raw_train_data[, ..continuous_var]
# lapply() rescales column by column; apply() would first coerce the whole
# table to a matrix before rescaling.
df_cont_norm <- as.data.frame(lapply(df_cont_norm, function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}))
# Long form: the plot reads the melted `variable` and `value` columns.
df_cont_norm <- as.data.frame(melt(df_cont_norm))

# Horizontal box plots of the scaled features; tick labels on the value axis
# are hidden.
cont_box_norm <- ggplot(df_cont_norm, aes(variable, value)) +
  geom_boxplot(aes(fill = variable)) +
  coord_flip() +
  scale_fill_tableau() +
  labs(x = "", y = "") +
  theme_minimal() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_blank()
  )

cont_box_norm

Discrete Variables

# Box plots of the integer category codes of every discrete feature, one
# panel per feature (read from the `Var2` and `value` columns of `df_disc`).
#
# `value` is passed through factor() before as.numeric(): when `value` is a
# character column (data.frame() stopped converting strings to factors by
# default in R 4.0.0), as.numeric(value) would return only NA and the plot
# would be empty. factor() sorts the distinct labels, so the codes match what
# an automatically created factor column would have produced.
# NOTE(review): the codes come from the labels of all features pooled
# together, not from each feature separately -- confirm this is intended.
disc_box <- ggplot(df_disc, aes(Var2, as.numeric(factor(value)))) +
  geom_boxplot(aes(fill = Var2)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  scale_x_discrete(expand = c(0, 0)) +
  facet_wrap(~Var2, scales = "free", ncol = 1) +
  scale_fill_tableau() +
  ggtitle("Distribution of each discrete variable") +
  labs(fill = "", x = "", y = "") +
  coord_flip() +
  theme_light() +
  theme(
    text = element_text(face = "bold"),
    legend.position = "none",
    axis.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5),
    strip.background = element_blank(),
    strip.text.x = element_blank()
  )

disc_box

7. Target distribution across features

7.1 Target distribution across continuous features

Age

# Density of `age` in the training data, one curve per value of the target `y`.
ggplot(data = raw_train_data, mapping = aes(x = age, colour = y)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#995052", "#529950")) +
  theme_classic() +
  labs(title = "Age Target Distribution")

Balance

# Density of `balance` in the training data, one curve per value of the target `y`.
ggplot(data = raw_train_data, mapping = aes(x = balance, colour = y)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#995052", "#529950")) +
  theme_classic() +
  labs(title = "Balance Target Distribution")

Day

# Density of `day` in the training data, one curve per value of the target `y`.
ggplot(data = raw_train_data, mapping = aes(x = day, colour = y)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#995052", "#529950")) +
  theme_classic() +
  labs(title = "Day Target Distribution")

Duration

# Density of `duration` in the training data, one curve per value of the target `y`.
ggplot(data = raw_train_data, mapping = aes(x = duration, colour = y)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#995052", "#529950")) +
  theme_classic() +
  labs(title = "Duration Target Distribution")

Campaign

# Density of `campaign` in the training data, one curve per value of the target `y`.
ggplot(data = raw_train_data, mapping = aes(x = campaign, colour = y)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#995052", "#529950")) +
  theme_classic() +
  labs(title = "Campaign Target Distribution")

pdays

# Density of `pdays` in the training data, one curve per value of the target `y`.
ggplot(data = raw_train_data, mapping = aes(x = pdays, colour = y)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#995052", "#529950")) +
  theme_classic() +
  labs(title = "pdays Target Distribution")

previous

# Density of `previous` in the training data, one curve per value of the target `y`.
ggplot(data = raw_train_data, mapping = aes(x = previous, colour = y)) +
  geom_density(alpha = 0.7) +
  scale_colour_manual(values = c("#995052", "#529950")) +
  theme_classic() +
  labs(title = "Previous Target Distribution")

7.2 Target distribution across discrete features

Job

 # Stacked bar chart of `job` counts split by the target `y`; each segment
 # carries its own count, printed at the segment's vertical centre.
 ggplot(data = raw_train_data, mapping = aes(x = job, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Marital

# Stacked bar chart of `marital` counts split by the target `y`; each segment
# carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = marital, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Education

# Stacked bar chart of `education` counts split by the target `y`; each
# segment carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = education, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Default

# Stacked bar chart of `default` counts split by the target `y`; each segment
# carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = default, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Housing

# Stacked bar chart of `housing` counts split by the target `y`; each segment
# carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = housing, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Loan

# Stacked bar chart of `loan` counts split by the target `y`; each segment
# carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = loan, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Contact

# Stacked bar chart of `contact` counts split by the target `y`; each segment
# carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = contact, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Month

# Stacked bar chart of `month` counts split by the target `y`; each segment
# carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = month, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )

Previous campaign

# Stacked bar chart of `poutcome` counts split by the target `y`; each segment
# carries its own count, printed at the segment's vertical centre.
ggplot(data = raw_train_data, mapping = aes(x = poutcome, fill = y)) +
  geom_bar(colour = "black") +
  geom_text(
    mapping = aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    colour = "white",
    size = 3
  )